import pandas as pd
import zipfile
# Specify the path to the zip file
zip_file_path = r"C:/Users/kisho/Downloads/acquisition_data.zip"
# Open the zip file
with zipfile.ZipFile(zip_file_path, 'r') as z:
# Extract and read the specific CSV file inside the zip
with z.open('customer_acquisition_data.csv') as f:
data = pd.read_csv(f)
# Now print the first few rows of the DataFrame
print(data.head())
customer_id channel cost conversion_rate revenue 0 1 referral 8.320327 0.123145 4199 1 2 paid advertising 30.450327 0.016341 3410 2 3 email marketing 5.246263 0.043822 3164 3 4 social media 9.546326 0.167592 1520 4 5 referral 8.320327 0.123145 2419
#create histograms for distribution of acquisition cost
fig = px.histogram(data,
x="cost",
nbins=20,
title='Distribution of Acquisition Cost')
fig.show()
#Distribution of revenue
fig = px.histogram(data,
x="revenue",
nbins=20,
title='Distribution of Revenue')
fig.show()
#comparison of cost of acquisition by channel
cost_by_channel = data.groupby('channel')['cost'].mean().reset_index()
fig = px.bar(cost_by_channel,
x='channel',
y='cost',
title='Customer Acquisition Cost by Channel')
fig.show()
#So paid advertisement is the most expensive channel, and email marketing is the least expensive channel.
#Now let’s see which channels are most and least effective at converting customers
conversion_by_channel = data.groupby('channel')['conversion_rate'].mean().reset_index()
fig = px.bar(conversion_by_channel, x='channel',
y='conversion_rate',
title='Conversion Rate by Channel')
fig.show()
#most and least profitable channels in terms of generating revenue:
revenue_by_channel = data.groupby('channel')['revenue'].sum().reset_index()
fig = px.pie(revenue_by_channel,
values='revenue',
names='channel',
title='Total Revenue by Channel',
hole=0.6, color_discrete_sequence=px.colors.qualitative.Pastel)
fig.show()
#email marketing is the most profitable channel in terms of generating revenue.
#Now let’s calculate the return on investment (ROI) for each channel:
data['roi'] = data['revenue'] / data['cost']
roi_by_channel = data.groupby('channel')['roi'].mean().reset_index()
fig = px.bar(roi_by_channel,
x='channel',
y='roi', title='Return on Investment (ROI) by Channel')
fig.show()
#CLVT
data['cltv'] = (data['revenue'] - data['cost']) * data['conversion_rate'] / data['cost']
channel_cltv = data.groupby('channel')['cltv'].mean().reset_index()
fig = px.bar(channel_cltv, x='channel', y='cltv', color='channel',
title='Customer Lifetime Value by Channel')
fig.update_xaxes(title='Channel')
fig.update_yaxes(title='CLTV')
fig.show()
#So the customer lifetime value from Social Media and the referral channels is the highest.
#let’s compare the CLTV distributions of the social media and referral channels:
subset = data.loc[data['channel'].isin(['social media', 'referral'])]
fig = px.box(subset, x='channel', y='cltv', title='CLTV Distribution by Channel')
fig.update_xaxes(title='Channel')
fig.update_yaxes(title='CLTV')
fig.update_layout(legend_title='Channel')
fig.show()
#There’s not much difference,
#but the Customer Lifetime Value from the Social Media channel is slightly better than the referral channe
#summary
Customer lifetime value analysis is used to estimate the total value of customers
to the business over the lifetime of their relationship.
It helps companies determine how much to invest in customer acquisition and retention,
as well as identify the most valuable customers to prioritize for retention efforts